# Install and load two new packages:
library(ggplot2)
library(maps)
library(ggmap)

# Load our data:
mvt = read.csv("D:/Data/mvt.csv", stringsAsFactors=FALSE)

# Convert the Date variable to a format that R will recognize:
mvt$Date = strptime(mvt$Date, format="%m/%d/%y %H:%M")

# Extract the hour and the day of the week:
mvt$Weekday = weekdays(mvt$Date)
mvt$Hour = mvt$Date$hour

str(mvt)
## 'data.frame':    191641 obs. of  5 variables:
##  $ Date     : POSIXlt, format: "2012-12-31 23:15:00" "2012-12-31 22:00:00" ...
##  $ Latitude : num  41.8 41.9 42 41.8 41.8 ...
##  $ Longitude: num  -87.6 -87.7 -87.8 -87.7 -87.6 ...
##  $ Weekday  : chr  "Monday" "Monday" "Monday" "Monday" ...
##  $ Hour     : int  23 22 22 22 21 20 20 20 19 18 ...
# Create a simple line plot - need the total number of crimes on each day of the week. We can get this information by creating a table:
table(mvt$Weekday)
## 
##    Friday    Monday  Saturday    Sunday  Thursday   Tuesday Wednesday 
##     29284     27397     27118     26316     27319     26791     27416
# Save this table as a data frame:
WeekdayCounts = as.data.frame(table(mvt$Weekday))

# Create our plot
ggplot(WeekdayCounts, aes(x=Var1, y=Freq)) + geom_line(aes(group=1))  

# Make the "Var1" variable an ORDERED factor variable
WeekdayCounts$Var1 = factor(WeekdayCounts$Var1, ordered=TRUE, levels=c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday","Saturday"))

# Try again:
ggplot(WeekdayCounts, aes(x=Var1, y=Freq)) + geom_line(aes(group=1))

# Change our x and y labels:
ggplot(WeekdayCounts, aes(x=Var1, y=Freq)) + geom_line(aes(group=1), alpha=0.3) + xlab("Day of the Week") + ylab("Total Motor Vehicle Thefts")

# Create a counts table for the weekday and hour:
table(mvt$Weekday, mvt$Hour)
##            
##                0    1    2    3    4    5    6    7    8    9   10   11
##   Friday    1873  932  743  560  473  602  839 1203 1268 1286  938  822
##   Monday    1900  825  712  527  415  542  772 1123 1323 1235  971  737
##   Saturday  2050 1267  985  836  652  508  541  650  858 1039  946  789
##   Sunday    2028 1236 1019  838  607  461  478  483  615  864  884  787
##   Thursday  1856  816  696  508  400  534  799 1135 1298 1301  932  731
##   Tuesday   1691  777  603  464  414  520  845 1118 1175 1174  948  786
##   Wednesday 1814  790  619  469  396  561  862 1140 1329 1237  947  763
##            
##               12   13   14   15   16   17   18   19   20   21   22   23
##   Friday    1207  857  937 1140 1165 1318 1623 1652 1736 1881 2308 1921
##   Monday    1129  824  958 1059 1136 1252 1518 1503 1622 1815 2009 1490
##   Saturday  1204  767  963 1086 1055 1084 1348 1390 1570 1702 2078 1750
##   Sunday    1192  789  959 1037 1083 1160 1389 1342 1706 1696 2079 1584
##   Thursday  1093  752  831 1044 1131 1258 1510 1537 1668 1776 2134 1579
##   Tuesday   1108  762  908 1071 1090 1274 1553 1496 1696 1816 2044 1458
##   Wednesday 1225  804  863 1075 1076 1289 1580 1507 1718 1748 2093 1511
# Save this to a data frame:
DayHourCounts = as.data.frame(table(mvt$Weekday, mvt$Hour))

# Convert the second variable, Var2, to numbers and call it Hour:
DayHourCounts$Hour = as.numeric(as.character(DayHourCounts$Var2))

# Create out plot:
ggplot(DayHourCounts, aes(x=Hour, y=Freq)) + geom_line(aes(group=Var1))

# Change the colors
ggplot(DayHourCounts, aes(x=Hour, y=Freq)) + geom_line(aes(group=Var1, color=Var1), size=2)

# Separate the weekends from the weekdays:
DayHourCounts$Type = ifelse((DayHourCounts$Var1 == "Sunday") | (DayHourCounts$Var1 == "Saturday"), "Weekend", "Weekday")

# Redo our plot, this time coloring by Type:
ggplot(DayHourCounts, aes(x=Hour, y=Freq)) + geom_line(aes(group=Var1, color=Type), size=2) 

# Make the lines a little transparent:
ggplot(DayHourCounts, aes(x=Hour, y=Freq)) + geom_line(aes(group=Var1, color=Type), size=2, alpha=0.5) 

# Fix the order of the days:
DayHourCounts$Var1 = factor(DayHourCounts$Var1, ordered=TRUE, levels=c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))

# Make a heatmap:
ggplot(DayHourCounts, aes(x = Hour, y = Var1)) + geom_tile(aes(fill = Freq))

# Change the label on the legend, and get rid of the y-label:
ggplot(DayHourCounts, aes(x = Hour, y = Var1)) + geom_tile(aes(fill = Freq)) + scale_fill_gradient(name="Total MV Thefts") + theme(axis.title.y = element_blank())

# Change the color scheme
ggplot(DayHourCounts, aes(x = Hour, y = Var1)) + geom_tile(aes(fill = Freq)) + scale_fill_gradient(name="Total MV Thefts", low="white", high="red") + theme(axis.title.y = element_blank())

#a map of Chicago into R:
chicago = get_map(location = "chicago", zoom = 11)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=chicago&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=chicago&sensor=false
# Look at the map
ggmap(chicago)

# Plot the first 100 motor vehicle thefts:
ggmap(chicago) + geom_point(data = mvt[1:100,], aes(x = Longitude, y = Latitude))
## Warning: Removed 7 rows containing missing values (geom_point).

# Round our latitude and longitude to 2 digits of accuracy, and create a crime counts data frame for each area:
LatLonCounts = as.data.frame(table(round(mvt$Longitude,2), round(mvt$Latitude,2)))

str(LatLonCounts)
## 'data.frame':    1638 obs. of  3 variables:
##  $ Var1: Factor w/ 42 levels "-87.93","-87.92",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Var2: Factor w/ 39 levels "41.64","41.65",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Freq: int  0 0 0 0 0 0 0 0 0 0 ...
# Convert our Longitude and Latitude variable to numbers:
LatLonCounts$Long = as.numeric(as.character(LatLonCounts$Var1))
LatLonCounts$Lat = as.numeric(as.character(LatLonCounts$Var2))

# Plot these points on our map:
ggmap(chicago) + geom_point(data = LatLonCounts, aes(x = Long, y = Lat, color = Freq, size=Freq))
## Warning: Removed 615 rows containing missing values (geom_point).

# Change the color scheme:
ggmap(chicago) + geom_point(data = LatLonCounts, aes(x = Long, y = Lat, color = Freq, size=Freq)) + scale_colour_gradient(low="yellow", high="red")
## Warning: Removed 615 rows containing missing values (geom_point).

# We can also use the geom_tile geometry
ggmap(chicago) + geom_tile(data = LatLonCounts, aes(x = Long, y = Lat, alpha = Freq), fill="red")

LatLonCounts2 <- subset(LatLonCounts,Freq>0)


# Load our data:
murders= read.csv("D:/Data/murders.csv", stringsAsFactors=FALSE)

# Change the color scheme:
ggmap(chicago) + geom_point(data = LatLonCounts2, aes(x = Long, y = Lat, color = Freq, size=Freq)) + scale_colour_gradient(low="yellow", high="red")
## Warning: Removed 119 rows containing missing values (geom_point).

str(murders)
## 'data.frame':    51 obs. of  6 variables:
##  $ State            : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ Population       : int  4779736 710231 6392017 2915918 37253956 5029196 3574097 897934 601723 19687653 ...
##  $ PopulationDensity: num  94.65 1.26 57.05 56.43 244.2 ...
##  $ Murders          : int  199 31 352 130 1811 117 131 48 131 987 ...
##  $ GunMurders       : int  135 19 232 93 1257 65 97 38 99 669 ...
##  $ GunOwnership     : num  0.517 0.578 0.311 0.553 0.213 0.347 0.167 0.255 0.036 0.245 ...
# Load the map of the US
statesMap = map_data("state")

# Plot the map:
ggplot(statesMap, aes(x = long, y = lat, group = group)) + geom_polygon(fill = "white", color = "black") 

# Create a new variable called region with the lowercase names to match the statesMap:
murders$region = tolower(murders$State)

# Join the statesMap data and the murders data into one dataframe:
murderMap = merge(statesMap, murders, by="region")
str(murderMap)
## 'data.frame':    15537 obs. of  12 variables:
##  $ region           : chr  "alabama" "alabama" "alabama" "alabama" ...
##  $ long             : num  -87.5 -87.5 -87.5 -87.5 -87.6 ...
##  $ lat              : num  30.4 30.4 30.4 30.3 30.3 ...
##  $ group            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ order            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ subregion        : chr  NA NA NA NA ...
##  $ State            : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ Population       : int  4779736 4779736 4779736 4779736 4779736 4779736 4779736 4779736 4779736 4779736 ...
##  $ PopulationDensity: num  94.7 94.7 94.7 94.7 94.7 ...
##  $ Murders          : int  199 199 199 199 199 199 199 199 199 199 ...
##  $ GunMurders       : int  135 135 135 135 135 135 135 135 135 135 ...
##  $ GunOwnership     : num  0.517 0.517 0.517 0.517 0.517 0.517 0.517 0.517 0.517 0.517 ...
# Plot the number of murder on our map of the United States:
ggplot(murderMap, aes(x = long, y = lat, group = group, fill = Murders)) + geom_polygon(color = "black") + scale_fill_gradient(low = "blue", high = "red", guide = "legend")

# Create a new variable that is the number of murders per 100,000 population:
murderMap$MurderRate = murderMap$Murders / murderMap$Population * 100000

# Plot the number of murder on our map of the United States:
ggplot(murderMap, aes(x = long, y = lat, group = group, fill = MurderRate)) + geom_polygon(color = "black") + scale_fill_gradient(low = "blue", high = "red", guide = "legend")

# Redo the plot, removing any states with murder rates above 10:
ggplot(murderMap, aes(x = long, y = lat, group = group, fill = MurderRate)) + geom_polygon(color = "black") + scale_fill_gradient(low = "blue", high = "red", guide = "legend")

# Plot the gunowner on our map of the United States:
ggplot(murderMap, aes(x = long, y = lat, group = group, fill = GunOwnership)) + geom_polygon(color = "black") + scale_fill_gradient(low = "blue", high = "red", guide = "legend")